Code
#load necessary packages
library(vcfR)
library(SNPfiltR)
#read in unlinked SNP dataset
v <- read.vcfR("~/Desktop/nmel.ceyx.rad/filtered.unlinked.snps.vcf.gz")#load necessary packages
library(vcfR)
library(SNPfiltR)
#read in unlinked SNP dataset
v <- read.vcfR("~/Desktop/nmel.ceyx.rad/filtered.unlinked.snps.vcf.gz")#print vcf data
v***** Object of Class vcfR *****
107 samples
2457 CHROMs
2,580 variants
Object size: 22.4 Mb
3.187 percent missing data
***** ***** *****
#read in sampling details
sample.info<-read.csv("~/Desktop/nmel.ceyx.rad/ceyx.full.sampling.csv")
#retain only samples that passed all filtering protocols (assumes that the 'ID' column is identical to sample names in the vcf)
samps<-sample.info[sample.info$ID %in% colnames(v@gt)[-1],]
#reorder the sample info file to match the order of samples in the vcf
samps<-samps[match(colnames(v@gt)[-1],samps$ID),]
#make sure it worked (should be all true)
samps$ID == colnames(v@gt)[-1] [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[106] TRUE TRUE
samps$admix.assign<-as.factor(samps$admix.assign)
#use a for loop to randomly downsample 2 samples from each of the 11 lineages 3 separate times
for (i in 1:3){
#generate a set of 10 randomly selected samples (one from each species) using the sampling df called 'samps'
x<-c() #make an empty vector called 'x' to hold this list
for (j in 1:length(levels(samps$admix.assign))){
#if there's less than one sample for a species, just add that sample
if(length(samps$ID[samps$admix.assign == levels(samps$admix.assign)[j]]) == 1){
x<-c(x,samps$ID[samps$admix.assign == levels(samps$admix.assign)[j]])
}
#if there's more than one sample for a species, randomly sample two and add them to the vector
else{
x<-c(x,sample(samps$ID[samps$admix.assign == levels(samps$admix.assign)[j]], size=2))
}
}
#add "FORMAT" to 'x' so that the vcf info column is retained
x<-c(x, "FORMAT")
#subset the vcf to only the random samples plus the vcfR info (column 1)
vcf.sub <- v[,colnames(v@gt) %in% x]
#filter out invariant sites
vcf.comp<-min_mac(vcf.sub, min.mac = 1)
#write filtered subset vcf to disk
print(colnames(vcf.comp@gt)) #print sample names in retained vcf
print(samps$admix.assign[samps$ID %in% colnames(vcf.comp@gt)]) #print the assigned taxa for each sample to make sure we have subsampled correctly
#commented to prevent overwriting the exact vcfs used in this analysis
#vcfR::write.vcf(vcf.comp, file = paste0("~/Desktop/nmel.ceyx.rad/snapp/rep",i,".vcf.gz")) #write to disk
#write out a popmap that corresponds to this particular replicate:
#define dataframe
zz<-cbind.data.frame(samps$admix.assign[samps$ID %in% colnames(vcf.comp@gt)],colnames(vcf.comp@gt)[-1])
colnames(zz)<-c("species","individual") #fix colnames
#write to disk
#commented to prevent overwriting
#write.table(zz, file = paste0("~/Desktop/nmel.ceyx.rad/snapp/popmap.rep",i,".txt"), sep = "\t", quote = F, row.names = F)
}35.62% of SNPs fell below a minor allele count of 1 and were removed from the VCF
[1] "FORMAT" "C_solitarius_7604" "C_solitarius_9572"
[4] "C_nigromaxilla_15907" "C_margarethae_27254" "C_margarethae_27261"
[7] "C_sacerdotis_27646" "C_mulcatus_27674" "C_mulcatus_27686"
[10] "C_sacerdotis_29529" "C_malaitae_32770" "C_collectoris_33757"
[13] "C_collectoris_33789" "C_meeki_34848" "C_meeki_34855"
[16] "C_gentianus_34953" "C_gentianus_13530" "C_nigromaxilla_15892"
[19] "C_collectoris_33922" "C_collectoris_36115" "C_dispar_5611-2"
[22] "C_malaitae_32790-2"
[1] solitarius solitarius nigromaxilla margarethae margarethae
[6] sacerdotis mulcatus mulcatus sacerdotis malaitae
[11] coll.west coll.west meeki meeki gentianus
[16] gentianus nigromaxilla coll.east coll.east dispar
[21] malaitae
11 Levels: coll.east coll.west dispar gentianus malaitae margarethae ... solitarius
33.26% of SNPs fell below a minor allele count of 1 and were removed from the VCF
[1] "FORMAT" "C_sacerdotis_5317" "C_solitarius_9572"
[4] "C_margarethae_27245" "C_mulcatus_27674" "C_mulcatus_27746"
[7] "C_sacerdotis_29529" "C_meeki_32014" "C_malaitae_32770"
[10] "C_nigromaxilla_32846" "C_collectoris_33842" "C_meeki_34862"
[13] "C_gentianus_34953" "C_gentianus_35042" "C_collectoris_36076"
[16] "C_collectoris_36129" "C_solitarius_7295" "C_margarethae_14484"
[19] "C_nigromaxilla_15880" "C_collectoris_33863" "C_dispar_5611-2"
[22] "C_malaitae_32790-2"
[1] sacerdotis solitarius margarethae mulcatus mulcatus
[6] sacerdotis meeki malaitae nigromaxilla coll.west
[11] meeki gentianus gentianus coll.east coll.east
[16] solitarius margarethae nigromaxilla coll.west dispar
[21] malaitae
11 Levels: coll.east coll.west dispar gentianus malaitae margarethae ... solitarius
35.04% of SNPs fell below a minor allele count of 1 and were removed from the VCF
[1] "FORMAT" "C_solitarius_4846" "C_sacerdotis_5317"
[4] "C_solitarius_7629" "C_margarethae_27254" "C_sacerdotis_27646"
[7] "C_mulcatus_27686" "C_mulcatus_27839" "C_meeki_32007"
[10] "C_meeki_32024" "C_malaitae_32770" "C_nigromaxilla_32860"
[13] "C_collectoris_33759" "C_collectoris_33789" "C_gentianus_34953"
[16] "C_gentianus_34969" "C_collectoris_36096" "C_nigromaxilla_15880"
[19] "C_margarethae_19259" "C_collectoris_36115" "C_dispar_5611-2"
[22] "C_malaitae_32790-2"
[1] solitarius sacerdotis solitarius margarethae sacerdotis
[6] mulcatus mulcatus meeki meeki malaitae
[11] nigromaxilla coll.west coll.west gentianus gentianus
[16] coll.east nigromaxilla margarethae coll.east dispar
[21] malaitae
11 Levels: coll.east coll.west dispar gentianus malaitae margarethae ... solitarius
#read in last vcf file read to disk, to check that it looks right
z <- read.vcfR(paste0("~/Desktop/nmel.ceyx.rad/snapp/rep",i,".vcf.gz"))Scanning file to determine attributes.
File attributes:
meta lines: 14
header_line: 15
variant count: 1684
column count: 30
Meta line 14 read in.
All meta lines processed.
gt matrix initialized.
Character matrix gt created.
Character matrix gt rows: 1684
Character matrix gt cols: 30
skip: 0
nrows: 1684
row_num: 0
Processed variant 1000
Processed variant: 1684
All variants processed
colnames(z@gt) [1] "FORMAT" "C_sacerdotis_5307" "C_solitarius_5447-8"
[4] "C_nigromaxilla_15907" "C_margarethae_27254" "C_sacerdotis_27646"
[7] "C_mulcatus_27674" "C_mulcatus_27852" "C_meeki_32023"
[10] "C_malaitae_32770" "C_collectoris_33789" "C_collectoris_33871"
[13] "C_meeki_34870" "C_gentianus_34953" "C_gentianus_34969"
[16] "C_collectoris_36072" "C_collectoris_36212" "C_solitarius_6982"
[19] "C_nigromaxilla_15892" "C_margarethae_19259" "C_dispar_5611-2"
[22] "C_malaitae_32790-2"
z***** Object of Class vcfR *****
21 samples
1630 CHROMs
1,684 variants
Object size: 3.7 Mb
4.044 percent missing data
***** ***** *****
#read in last popmap read to disk, to check that it looks right
read.table(paste0("~/Desktop/nmel.ceyx.rad/snapp/popmap.rep",i,".txt")) V1 V2
1 species individual
2 sacerdotis C_sacerdotis_5307
3 solitarius C_solitarius_5447-8
4 nigromaxilla C_nigromaxilla_15907
5 margarethae C_margarethae_27254
6 sacerdotis C_sacerdotis_27646
7 mulcatus C_mulcatus_27674
8 mulcatus C_mulcatus_27852
9 meeki C_meeki_32023
10 malaitae C_malaitae_32770
11 coll.west C_collectoris_33789
12 coll.west C_collectoris_33871
13 meeki C_meeki_34870
14 gentianus C_gentianus_34953
15 gentianus C_gentianus_34969
16 coll.east C_collectoris_36072
17 coll.east C_collectoris_36212
18 solitarius C_solitarius_6982
19 nigromaxilla C_nigromaxilla_15892
20 margarethae C_margarethae_19259
21 dispar C_dispar_5611-2
22 malaitae C_malaitae_32790-2
#paper that this distribution is based on can be found here: https://royalsocietypublishing.org/doi/full/10.1098/rspb.2019.0122
#use the BEAUTI2 GUI to visualize a distribution where 95% of the probability density curve approximates the 95% HPD estimated from the above paper:
knitr::include_graphics("/Users/devonderaad/Desktop/beauti.lognormal.95.png")#ideal parameters are mean = 3.9, stdev = 0.15#make date constraint file called 'date.con.txt'
#make line 1: distribution of the node constraint, the word "crown", and the list of species that are in this crown clade
date.con<-c("lognormal(0,3.9,.15)","crown","solitarius,nigromaxilla,sacerdotis,mulcatus,meeki,malaitae,coll.west,coll.east,gentianus,dispar")
#make line 2: the word "monophyletic", "NA", and the list of species to enforce as monophyletic
date.con<-rbind.data.frame(date.con,c("monophyletic","NA","solitarius,nigromaxilla,sacerdotis,mulcatus,meeki,malaitae,coll.west,coll.east,gentianus,dispar"))
#write to disk
#write.table(date.con, file = paste0("~/Desktop/nmel.ceyx.rad/snapp/date.con.txt"), sep = "\t", quote = F, row.names = F, col.names = F)
#now make a starting tree file called tre.start.txt based on the concatenated ML topology
#write tree
t<-"(margarethae,((mulcatus,solitarius),(sacerdotis,(dispar,((malaitae,meeki),(gentianus,(nigromaxilla,(coll.east,coll.west))))))))"
#write.table(t, file = paste0("~/Desktop/nmel.ceyx.rad/snapp/tre.start.txt"), sep = "\t", quote = F, row.names = F, col.names = F)Notes on running SNAPP or Snapper with a date constrained node, starting tree, and constraint tree. What I’ve put together here is based on the following two resources:
tutorial on divergence time estimation using the ruby scrpt snapp_prep: https://github.com/mmatschiner/tutorials/blob/master/divergence_time_estimation_with_snp_data/README.md
github page for that script: https://github.com/mmatschiner/snapp_prep
My workflow is as follows, and below is the most “constrained” version I’ve run. The simplest version is just with a date constraint.
#when I run this ruby script it references four files
#-v (input data; thinned VCF)
#-c (date constraint file; this includes the topology constraint)
#-s (starting tree file)
#-t (popmap)
#plus, three flags
#-x (name of the output xml file including a .xml extension)
#-o (the name prefix to append to log and tree files)
#-l (MCMC length)
#the date file is called 'date.con.txt' and it looks like this:
#lognormal(0,0.56,0.10) crown kul,par,tet,spl
#monophyletic NA eic,gri,lon,mur
#this basically says, put a date constraint on the node that comprises those four tips (kul,par,tet,spl)
#and your constraint is centered on 0.56 MA, with a stdev of 0.10, and zero offset, and your distribution is lognormal
#the second line is the tree constraint which says keep that group of four tips monophyletic
#to enforce a root, you could just force all your ingroup tips to be monophyletic
#the popmap file is a standard tab-separated popmap, e.g.,:
#species individual
#eic Z_gr329017.sorted
#pal Z_gr330116.sorted
#eic Z_gr335071.sorted
#finally, specify a starting tree (not a constraint tree) which must match the topology specified in the date constraint file above, i.e., you do have to specify a start tree in order to have overlapping starting space with your other topological constraints. Example:
#((vel,(lut,((spl,(tet,par)),kul))),(pal,(mur,(eic,(gri,lon)))))
#once you have all these files in one directory you simply run the below text to make five replicate input xml files
#download ruby script
wget https://raw.githubusercontent.com/mmatschiner/snapp_prep/master/snapp_prep.rb
#view help menu
ruby snapp_prep.rb -h
#unzip each output vcf
gunzip rep1.vcf.gz
gunzip rep2.vcf.gz
gunzip rep3.vcf.gz
#prepare each input .xml
ruby snapp_prep.rb -a SNAPP -v rep1.vcf -t popmap.rep1.txt -c date.con.txt -s tre.start.txt -x rep1.xml -o rep1 -l 5000000
ruby snapp_prep.rb -a SNAPP -v rep2.vcf -t popmap.rep2.txt -c date.con.txt -s tre.start.txt -x rep2.xml -o rep2 -l 5000000
ruby snapp_prep.rb -a SNAPP -v rep3.vcf -t popmap.rep3.txt -c date.con.txt -s tre.start.txt -x rep3.xml -o rep3 -l 5000000
#and if you wanted to produce an xml that will run Snapper instead of SNAPP you just add the "-a SNAPPER" flag included, see the snapp_prep.rb help menu#!/bin/sh
#
#SBATCH --job-name=snapp # Job Name
#SBATCH --nodes=1 # 40 nodes
#SBATCH --ntasks-per-node=25 # 40 CPU allocation per Task
#SBATCH --partition=bi # Name of the Slurm partition used
#SBATCH --chdir=/home/d669d153/work/nmel.ceyx/snapp # Set working d$
#SBATCH --mem-per-cpu=800 # memory requested
#SBATCH --array=1-3
#SBATCH --time=10000
#run beast 2.7.1
/home/d669d153/work/beast.2.7.1/beast/bin/beast -threads 25 rep$SLURM_ARRAY_TASK_ID.xml#Determine which of the replicates (here, rep 3) had the greatest posterior probability and should be presented in the paper
knitr::include_graphics("/Users/devonderaad/Desktop/nmel.ceyx.rad/snapp/trace.posteriors.png")#Determine which of the replicates (here, rep 3) had the greatest posterior probability and should be presented in the paper
knitr::include_graphics("/Users/devonderaad/Desktop/nmel.ceyx.rad/snapp/rep3.trace.png")#rep1
knitr::include_graphics("/Users/devonderaad/Desktop/nmel.ceyx.rad/snapp/rep1.png")#rep2
knitr::include_graphics("/Users/devonderaad/Desktop/nmel.ceyx.rad/snapp/rep2.png")#rep3
knitr::include_graphics("/Users/devonderaad/Desktop/nmel.ceyx.rad/snapp/rep3.png")knitr::include_graphics("/Users/devonderaad/Desktop/nmel.ceyx.rad/snapp/rep3.contree.png")